#LOAD LIBRARIES
import numpy as np
import pandas as pd
import os
import json
import descartes
import fiona
import folium
import datetime
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import re
import geopandas as gpd
from shapely.geometry import Point, Polygon
%matplotlib inline
#if getting error message then perform the following
#conda install -c conda-forge geopandas
#conda install -c conda-forge pyproj
#conda install -c conda-forge rtree=0.9.3
#LOAD DATA
#Source: https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j
df = pd.read_csv('NYC_Restaurant_Inspection_Results.csv')
#KEEP RELEVANT VARIABLES: discard identifier/admin/geo columns not used downstream
df = df.drop(columns=[
    'INSPECTION TYPE', 'VIOLATION CODE', 'VIOLATION DESCRIPTION', 'PHONE',
    'BIN', 'BBL', 'NTA', 'Community Board', 'Council District',
    'Census Tract', 'RECORD DATE',
])
#DATA CLEANING & FEATURE ENGINEERING
#DATA CLEANING
#Replace spaces with underscores in column names
df.columns = df.columns.str.replace(" ", "_")
#Clean up some of the restaurant names (DBA = "doing business as")
#BUG FIX: the original `replace("'", "")` had no regex=True, so it only matched
#cells whose ENTIRE value was a single apostrophe; with regex=True it strips
#apostrophes anywhere in the name, as the comment intended.
df["DBA"].replace("'", "", regex=True, inplace=True)  # Remove apostrophes
df["DBA"].replace(r" ?\(.+\)", "", regex=True, inplace=True)  # Remove values in parenthesis
df["DBA"].replace(r" ?#.*", "", regex=True, inplace=True)  # Remove # followed by some string
#Changing dates to datetime
df.INSPECTION_DATE = pd.to_datetime(df.INSPECTION_DATE, format="%m/%d/%Y")
df.GRADE_DATE = pd.to_datetime(df.GRADE_DATE, format="%m/%d/%Y")
#Cleaning some column values for analysis: collapse rare / verbose cuisine
#labels via a single full-value replacement pass (no value produced here is
#itself a key, so one dict replace matches the original sequential calls).
cuisine_relabels = {
    "Basque": "Other Rest",
    "Nuts/Confectionary": "Other Rest",
    "Fruits/Vegetables": "Other Rest",
    "Not Listed/Not Applicable": "Other Rest",
    "English": "Other Rest",
    "Bottled beverages, including water, sodas, juices, etc.": "Other Rest",
    "Juice, Smoothies, Fruit Salads": "Other Rest",
    "Latin American (Central and South America)": "Latin American",
    "Sandwiches/Salads/Mixed Buffet": "Sandwiches",
    "Soups/Salads/Sandwiches": "Sandwiches",
    "Chinese/Cuban": "Chinese",
    "Hotdogs/Pretzels": "Hotdogs",
    "American": "American Rest",
    "Asian": "Asian Rest",
    "African": "African Rest",
    "Middle Eastern": "Middle Eastern Rest",
}
df.replace(to_replace=cuisine_relabels, inplace=True)
#Feature Engineering(Category, Unique Key, Count, Tooltip)
# Map each CUISINE_DESCRIPTION into a coarse 'category' bucket via chained
# str.replace calls.  NOTE(review): these are SUBSTRING replacements applied
# strictly in order, so earlier replacements rewrite the string before later
# patterns are tested — e.g. 'Chinese' -> 'Asian' fires before
# 'Chinese/Japanese' is reached, so several later patterns (and the repeated
# 'Sandwiches' / 'Cakes, Cupcakes, Desserts' entries) can never match.
# They are dead but harmless.  Do NOT reorder this chain or convert it to a
# dict lookup: the behavior depends on the exact substring-replace sequence.
df['category']=df['CUISINE_DESCRIPTION'].apply(lambda x:x.replace('Chinese','Asian').replace('Pizza','Pizza/Sandwich')
.replace('Italian','European').replace('Japanese','Asian').replace('Mexican','Latin American')
.replace('Bakery','Bakery/Desert').replace('Spanish','European').replace('Sandwiches','Pizza/Sandwich')
.replace('Chicken','American').replace('Indian','Asian').replace('Hamburgers','American')
.replace('Donuts','Bakery/Desert').replace('Jewish/Kosher','Other').replace('Korean','Asian')
.replace('Thai','Asian').replace('Juice, Smoothies, Fruit Salads','Other').replace('French','European')
.replace('Mediterranean','African/Middle Eastern').replace('Tex-Mex','Other').replace('Irish','European')
.replace('Ice Cream, Gelato, Yogurt, Ices','Bakery/Desert').replace('Seafood','Other').replace('Bagels/Pretzels','Bakery/Desert')
.replace('Peruvian','Latin American').replace('Sandwiches/Salads/Mixed Buffet','Pizza/Sandwich').replace('Eastern European','European')
.replace('African Rest','African/Middle Eastern').replace('Vietnamese/Cambodian/Malaysia','Asian').replace('Greek','European')
.replace('Steak','Other').replace('Sandwiches','Pizza/Sandwich').replace('Soul Food','Other')
.replace('Barbecue','Other').replace('Salads','Pizza/Sandwich').replace('Russian','European')
.replace('Turkish','African/Middle Eastern').replace('Bangladeshi','Asian').replace('Chinese/Japanese','Asian')
.replace('Tapas','Caribbean').replace('Hawaiian','American').replace('Continental','American')
.replace('Vegan','Other').replace('Creole','Other').replace('Polish','European')
.replace('Brazilian','Latin American').replace('Filipino','Asian').replace('Pakistani','Asian')
.replace('Pancakes/Waffles','Bakery/Desert').replace('Ethiopian','African/Middle Eastern').replace('Moroccan','African/Middle Eastern')
.replace('Egyptian','African/Middle Eastern').replace('Creole/Cajun','Other').replace('German','European')
.replace('Portuguese','European').replace('Soups','Pizza/Sandwich').replace('Afghan','Asian')
.replace('Californian','American').replace('Southwestern','American').replace('Iranian','African/Middle Eastern')
.replace('Czech','European').replace('Chilean','Latin American').replace('Scandinavian','European').replace('Indonesian','Asian')
.replace('American Rest','American').replace('Café/Coffee/Tea','Cafe').replace('Caribbean (Inc. Cuban, Puerto Rican)','Caribbean')
.replace('Cajun','Other').replace('Other/Cajun','Other').replace('Other/Other','Other')
.replace('Asian/Asian','Asian').replace('Middle Eastern Rest','African/Middle Eastern')
.replace('Cakes, Cupcakes, Desserts','Bakery/Desert').replace('Cakes, Cupcakes, Desserts','Bakery/Desert').replace('Vegetarian','Other')
.replace('Asian Rest','Asian').replace('Other Rest','Other').replace('Hotdogs','Other').replace('Australian','Other').replace('Bagels','Bakery/Desert'))
# Pseudo-unique restaurant key: name + address fields joined with spaces
key_parts = df[['DBA', 'BUILDING', 'STREET', 'ZIPCODE']].astype(str)
df['KEY'] = key_parts.agg(' '.join, axis=1)
# Helper column so groupby(...).sum() yields a row count
df['COUNT'] = 1
# Hover text used later by the folium markers
# (NOTE(review): missing SCORE values render as the string 'nan' — confirm acceptable)
df['TOOLTIP'] = (df['KEY']
                 + ' ;CUISINE : ' + df['CUISINE_DESCRIPTION']
                 + ' ;SCORE : ' + df['SCORE'].astype(str))
#FILTERING DATA AND PREPARING IT FOR ANALYSIS
#Keeping only 'Manhattan' data from Jan'16 - Mar'20
nyc_rest = df[(df.INSPECTION_DATE > '2015-12-31') & (df.INSPECTION_DATE < '2020-04-01') & (df.BORO == 'Manhattan')]
#Removing Missing Values
nyc_rest = nyc_rest[nyc_rest['GRADE_DATE'].notna()]  # drop records with no grade date
nyc_rest = nyc_rest[nyc_rest['Latitude'].notna()]    # drop records with no geolocation
#Keep only each restaurant's latest inspection (CAMIS = unique restaurant id):
#find the max GRADE_DATE per CAMIS, then inner-join back on both keys.
#(Removed a dead `value_counts().sort_values()` expression whose result was
#discarded, and commented-out debug prints.)
latest_grade = nyc_rest.groupby("CAMIS").GRADE_DATE.max().reset_index()
nyc_rest = pd.merge(nyc_rest, latest_grade, how='inner', on=['CAMIS', 'GRADE_DATE'])
#Exploratory Data Analysis: dataset shape, columns, missingness, cardinality
summary_items = [
    ("Rows : ", nyc_rest.shape[0]),
    ("Columns : ", nyc_rest.shape[1]),
    ("\nFeatures : \n", nyc_rest.columns.tolist()),
    ("\nMissing values : \n", nyc_rest.isna().sum()),
    ("\nUnique values : \n", nyc_rest.nunique()),
]
for label, value in summary_items:
    print(label, value)
#Question 1 : Identify most hygenic(no violations) and least hygenic restaurants in Manhattan
#A) Restaurants with no violations (Most hygenic)
nyc_rest_best = nyc_rest[nyc_rest['ACTION'] == 'No violations were recorded at the time of this inspection.']
#idiomatic: reset_index(drop=True) replaces reset_index(inplace=True) + drop(columns='index')
nyc_rest_best = nyc_rest_best.reset_index(drop=True)
#B) Least hygenic restaurants: SCORE above the 99th percentile (higher score = more violation points)
#NOTE(review): np.percentile returns NaN if SCORE contains NaN, which would empty
#this filter — confirm SCORE has no missing values at this point.
nyc_rest_worst = nyc_rest[nyc_rest.SCORE > np.percentile(nyc_rest.SCORE, 99)]
nyc_rest_worst = nyc_rest_worst.reset_index(drop=True)
# Mapping most (green) and least (red) hygenic restaurants on a map with tooltips
new_york = folium.Map(
    location=[40.75, -73.92],  # roughly central NYC; [40.7128,-73.9352] also works
    zoom_start=11.75,
)
# Idiomatic row iteration with itertuples() instead of positional indexing via
# range(len(...)) — the original relied on the index having been reset to 0..n-1.
for markers_df, color in ((nyc_rest_best, 'green'), (nyc_rest_worst, 'red')):
    for row in markers_df.itertuples():
        folium.Marker(
            [row.Latitude, row.Longitude],
            popup=row.KEY,
            tooltip=row.TOOLTIP,
            icon=folium.Icon(color=color),
        ).add_to(new_york)
new_york
#Question 2 : Identify best and worst restaurants by cuisine. See if anything surprises you.
#Identify Grade A restaurants with no critical violations (best)
nyc_hygenic = nyc_rest[(nyc_rest['GRADE'] == 'A') & (nyc_rest['CRITICAL_FLAG'] == 'N')]
#idiomatic: reset_index(drop=True) replaces reset_index(inplace=True) + drop(columns='index')
nyc_hygenic = nyc_hygenic.reset_index(drop=True)
#Identify Grade B and C restaurants with critical violations (worst)
nyc_not_hygenic = nyc_rest[nyc_rest['GRADE'].isin(['B', 'C']) & (nyc_rest['CRITICAL_FLAG'] == 'Y')]
nyc_not_hygenic = nyc_not_hygenic.reset_index(drop=True)
#Creates the sunburst interactive charts. Click the inner ring to drill into cuisines.
#Removed dead code: `df1 = px.data.tips()` was assigned twice and never used, and
#plt.figure(figsize=(40,40)) only created empty matplotlib figures — plotly
#sizes its own figures and ignores matplotlib state.
fig = px.sunburst(nyc_hygenic, path=['category', 'CUISINE_DESCRIPTION'], values='COUNT',
                  title='Most Hygenic Restaurants (GRADE A)')
fig.update_layout(title_x=0.5)  # center the title
fig.show()
fig = px.sunburst(nyc_not_hygenic, path=['category', 'CUISINE_DESCRIPTION'], values='COUNT',
                  title='Non Hygenic Restaurants (GRADE B or C)')
fig.update_layout(title_x=0.5)
fig.show()
# Question 3 : Identify how inspection and grading have changed for Manhattan restaurants over time (2017 - 2019) for different cuisines.
# Inspection details for different cuisines over time
df_manh = df[(df.INSPECTION_DATE > '2016-12-31') & (df.INSPECTION_DATE < '2020-01-01') & (df.BORO == 'Manhattan')]
# One row per restaurant/date/category (drop_duplicates returns a fresh frame,
# so the YEAR assignment below is safe from SettingWithCopy issues)
df_inspection = df_manh[['CAMIS', 'INSPECTION_DATE', 'category', 'COUNT']].drop_duplicates()
df_inspection['YEAR'] = df_inspection['INSPECTION_DATE'].dt.year  # idiomatic .dt accessor
df_inspection = df_inspection.reset_index(drop=True)
# Number of inspections and distinct inspected restaurants per category-year.
# Select 'COUNT' before summing: the original summed every column first,
# which is wasteful and warns on modern pandas for non-numeric columns.
df_inspection_yr = df_inspection.groupby(['category', 'YEAR'])['COUNT'].sum().reset_index()
df_camis_yr = df_inspection.groupby(['category', 'YEAR']).CAMIS.nunique().reset_index()
df_manh_inspection = pd.merge(df_inspection_yr, df_camis_yr, how='inner', on=['category', 'YEAR'])
df_manh_inspection['AVG_INSPECTIONS'] = df_manh_inspection['COUNT'] / df_manh_inspection['CAMIS']
# Renaming columns
df_manh_inspection = df_manh_inspection.rename(columns={'CAMIS': 'INSPECTED RESTAURANTS', 'COUNT': '# OF INSPECTIONS'})
# Grading details for different cuisines over time (same window and borough as above)
df_graded = df[(df.INSPECTION_DATE > '2016-12-31') & (df.INSPECTION_DATE < '2020-01-01') & (df.BORO == 'Manhattan')]
df_graded = df_graded[df_graded['GRADE_DATE'].notna()]  # drop records with no grade date
df_graded = df_graded[df_graded['Latitude'].notna()]    # drop records with no geolocation
# .copy() so the YEAR assignment below writes to an owned frame (avoids SettingWithCopyWarning)
df_graded = df_graded[['CAMIS', 'GRADE_DATE', 'category', 'COUNT']].copy()
df_graded['YEAR'] = df_graded['GRADE_DATE'].dt.year  # idiomatic .dt accessor
df_graded = df_graded.reset_index(drop=True)
# Number of violations and distinct graded restaurants per category-year
# (select 'COUNT' before summing, consistent with the inspection block)
df_graded_viol_yr = df_graded.groupby(['category', 'YEAR'])['COUNT'].sum().reset_index()
df_camis_viol_yr = df_graded.groupby(['category', 'YEAR']).CAMIS.nunique().reset_index()
df_manh_grade_viol = pd.merge(df_graded_viol_yr, df_camis_viol_yr, how='inner', on=['category', 'YEAR'])
df_manh_grade_viol['AVG_VIOLATIONS'] = df_manh_grade_viol['COUNT'] / df_manh_grade_viol['CAMIS']
# Renaming columns
df_manh_grade_viol = df_manh_grade_viol.rename(columns={'CAMIS': 'GRADED RESTAURANTS', 'COUNT': '# OF VIOLATIONS'})
#Joining the inspection and violation tables for the final analysis
df_manh_analysis = df_manh_inspection.merge(df_manh_grade_viol, how='inner', on=['category', 'YEAR'])
#Stringify YEAR so the plots treat it as a categorical axis
df_manh_analysis['YEAR'] = df_manh_analysis['YEAR'].astype('str')
df_manh_analysis
#One bar chart per cuisine category: average violations per restaurant by year,
#then average inspections per restaurant by year
g = sns.FacetGrid(data=df_manh_analysis, col='category', margin_titles=True)
g.map(plt.bar, 'YEAR', 'AVG_VIOLATIONS');
g1 = sns.FacetGrid(data=df_manh_analysis, col='category', margin_titles=True)
g1.map(plt.bar, 'YEAR', 'AVG_INSPECTIONS')